<!DOCTYPE html>
<html lang="zh-CN">

<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=2">
  <meta name="theme-color" content="#222">
  <meta name="generator" content="Hexo 4.2.1">
  <link rel="apple-touch-icon" sizes="180x180" href="/images/apple-touch-icon-next.png">
  <link rel="icon" type="image/png" sizes="32x32" href="/images/favicon-32x32-next.png">
  <link rel="icon" type="image/png" sizes="16x16" href="/images/favicon-16x16-next.png">
  <link rel="mask-icon" href="/images/safari-pinned-tab.svg" color="#222">
  <link rel="stylesheet" href="/css/main.css">
  <link rel="stylesheet" href="/lib/font-awesome/css/all.min.css">
  <link rel="stylesheet" href="/lib/pace/pace-theme-minimal.min.css">
  <script src="/lib/pace/pace.min.js"></script>
  <script id="hexo-configurations">
    var NexT = window.NexT ||
    {};
    var CONFIG = {
      "hostname": "cuiqingcai.com",
      "root": "/",
      "scheme": "Pisces",
      "version": "7.8.0",
      "exturl": false,
      "sidebar":
      {
        "position": "right",
        "width": 360,
        "display": "post",
        "padding": 18,
        "offset": 12,
        "onmobile": false,
        "widgets": [
          {
            "type": "image",
            "name": "阿布云",
            "enable": false,
            "url": "https://www.abuyun.com/http-proxy/introduce.html",
            "src": "https://qiniu.cuiqingcai.com/88au8.jpg",
            "width": "100%"
      },
          {
            "type": "image",
            "name": "天验",
            "enable": true,
            "url": "https://tutorial.lengyue.video/?coupon=12ef4b1a-a3db-11ea-bb37-0242ac130002_cqx_850",
            "src": "https://qiniu.cuiqingcai.com/bco2a.png",
            "width": "100%"
      },
          {
            "type": "image",
            "name": "华为云",
            "enable": false,
            "url": "https://activity.huaweicloud.com/2020_618_promotion/index.html?bpName=5f9f98a29e2c40b780c1793086f29fe2&bindType=1&salesID=wangyubei",
            "src": "https://qiniu.cuiqingcai.com/y42ik.jpg",
            "width": "100%"
      },
          {
            "type": "image",
            "name": "张小鸡",
            "enable": false,
            "url": "http://www.zxiaoji.com/",
            "src": "https://qiniu.cuiqingcai.com/fm72f.png",
            "width": "100%"
      },
          {
            "type": "image",
            "name": "Luminati",
            "src": "https://qiniu.cuiqingcai.com/ikkq9.jpg",
            "url": "https://luminati-china.io/?affiliate=ref_5fbbaaa9647883f5c6f77095",
            "width": "100%",
            "enable": false
      },
          {
            "type": "image",
            "name": "IPIDEA",
            "url": "http://www.ipidea.net/?utm-source=cqc&utm-keyword=?cqc",
            "src": "https://qiniu.cuiqingcai.com/0ywun.png",
            "width": "100%",
            "enable": true
      },
          {
            "type": "tags",
            "name": "标签云",
            "enable": true
      },
          {
            "type": "categories",
            "name": "分类",
            "enable": true
      },
          {
            "type": "friends",
            "name": "友情链接",
            "enable": true
      },
          {
            "type": "hot",
            "name": "猜你喜欢",
            "enable": true
      }]
      },
      "copycode":
      {
        "enable": true,
        "show_result": true,
        "style": "mac"
      },
      "back2top":
      {
        "enable": true,
        "sidebar": false,
        "scrollpercent": true
      },
      "bookmark":
      {
        "enable": false,
        "color": "#222",
        "save": "auto"
      },
      "fancybox": false,
      "mediumzoom": false,
      "lazyload": false,
      "pangu": true,
      "comments":
      {
        "style": "tabs",
        "active": "gitalk",
        "storage": true,
        "lazyload": false,
        "nav": null,
        "activeClass": "gitalk"
      },
      "algolia":
      {
        "hits":
        {
          "per_page": 10
        },
        "labels":
        {
          "input_placeholder": "Search for Posts",
          "hits_empty": "We didn't find any results for the search: ${query}",
          "hits_stats": "${hits} results found in ${time} ms"
        }
      },
      "localsearch":
      {
        "enable": true,
        "trigger": "auto",
        "top_n_per_article": 10,
        "unescape": false,
        "preload": false
      },
      "motion":
      {
        "enable": false,
        "async": false,
        "transition":
        {
          "post_block": "bounceDownIn",
          "post_header": "slideDownIn",
          "post_body": "slideDownIn",
          "coll_header": "slideLeftIn",
          "sidebar": "slideUpIn"
        }
      },
      "path": "search.xml"
    };

  </script>
  <meta name="description" content="本节中，我们利用requests库和正则表达式来抓取猫眼电影TOP100的相关内容。requests比urllib使用更加方便，而且目前我们还没有系统学习HTML解析库，所以这里就选用正则表达式来作为解析工具。 1. 本节目标 本节中，我们要提取出猫眼电影TOP100的电影名称、时间、评分、图片等信息，提取的站点URL为http:&#x2F;&#x2F;maoyan.com&#x2F;board&#x2F;4，提取的结果会以文件形式保存">
  <meta property="og:type" content="article">
  <meta property="og:title" content="[Python3网络爬虫开发实战] 3.4-抓取猫眼电影排行">
  <meta property="og:url" content="https://cuiqingcai.com/5534.html">
  <meta property="og:site_name" content="静觅">
  <meta property="og:description" content="本节中，我们利用requests库和正则表达式来抓取猫眼电影TOP100的相关内容。requests比urllib使用更加方便，而且目前我们还没有系统学习HTML解析库，所以这里就选用正则表达式来作为解析工具。 1. 本节目标 本节中，我们要提取出猫眼电影TOP100的电影名称、时间、评分、图片等信息，提取的站点URL为http:&#x2F;&#x2F;maoyan.com&#x2F;board&#x2F;4，提取的结果会以文件形式保存">
  <meta property="og:locale" content="zh_CN">
  <meta property="og:image" content="https://qiniu.cuiqingcai.com/wp-content/uploads/2018/02/3-11.jpg">
  <meta property="og:image" content="https://qiniu.cuiqingcai.com/wp-content/uploads/2018/02/3-12.jpg">
  <meta property="og:image" content="https://qiniu.cuiqingcai.com/wp-content/uploads/2018/02/3-13.jpg">
  <meta property="og:image" content="https://qiniu.cuiqingcai.com/wp-content/uploads/2018/02/3-14.jpg">
  <meta property="og:image" content="https://qiniu.cuiqingcai.com/wp-content/uploads/2018/02/3-15.jpg">
  <meta property="article:published_time" content="2018-01-27T06:40:15.000Z">
  <meta property="article:modified_time" content="2021-12-18T13:11:11.557Z">
  <meta property="article:author" content="崔庆才">
  <meta property="article:tag" content="python">
  <meta name="twitter:card" content="summary">
  <meta name="twitter:image" content="https://qiniu.cuiqingcai.com/wp-content/uploads/2018/02/3-11.jpg">
  <link rel="canonical" href="https://cuiqingcai.com/5534.html">
  <script id="page-configurations">
    // https://hexo.io/docs/variables.html
    CONFIG.page = {
      sidebar: "",
      isHome: false,
      isPost: true,
      lang: 'zh-CN'
    };

  </script>
  <title>[Python3网络爬虫开发实战] 3.4-抓取猫眼电影排行 | 静觅</title>
  <meta name="google-site-verification" content="p_bIcnvirkFzG2dYKuNDivKD8-STet5W7D-01woA2fc" />
  <noscript>
    <style>
      .use-motion .brand,
      .use-motion .menu-item,
      .sidebar-inner,
      .use-motion .post-block,
      .use-motion .pagination,
      .use-motion .comments,
      .use-motion .post-header,
      .use-motion .post-body,
      .use-motion .collection-header
      {
        opacity: initial;
      }

      .use-motion .site-title,
      .use-motion .site-subtitle
      {
        opacity: initial;
        top: initial;
      }

      .use-motion .logo-line-before i
      {
        left: initial;
      }

      .use-motion .logo-line-after i
      {
        right: initial;
      }

    </style>
  </noscript>
  <link rel="alternate" href="/atom.xml" title="静觅" type="application/atom+xml">
</head>

<body itemscope itemtype="http://schema.org/WebPage">
  <div class="container">
    <div class="headband"></div>
    <header class="header" itemscope itemtype="http://schema.org/WPHeader">
      <div class="header-inner">
        <div class="site-brand-container">
          <div class="site-nav-toggle">
            <div class="toggle" aria-label="切换导航栏">
              <span class="toggle-line toggle-line-first"></span>
              <span class="toggle-line toggle-line-middle"></span>
              <span class="toggle-line toggle-line-last"></span>
            </div>
          </div>
          <div class="site-meta">
            <a href="/" class="brand" rel="start">
              <span class="logo-line-before"><i></i></span>
              <h1 class="site-title">静觅 <span class="site-subtitle"> 崔庆才的个人站点 </span>
              </h1>
              <span class="logo-line-after"><i></i></span>
            </a>
          </div>
          <div class="site-nav-right">
            <div class="toggle popup-trigger">
              <i class="fa fa-search fa-fw fa-lg"></i>
            </div>
          </div>
        </div>
        <nav class="site-nav">
          <ul id="menu" class="main-menu menu">
            <li class="menu-item menu-item-home">
              <a href="/" rel="section">首页</a>
            </li>
            <li class="menu-item menu-item-archives">
              <a href="/archives/" rel="section">文章列表</a>
            </li>
            <li class="menu-item menu-item-tags">
              <a href="/tags/" rel="section">文章标签</a>
            </li>
            <li class="menu-item menu-item-categories">
              <a href="/categories/" rel="section">文章分类</a>
            </li>
            <li class="menu-item menu-item-about">
              <a href="/about/" rel="section">关于博主</a>
            </li>
            <li class="menu-item menu-item-message">
              <a href="/message/" rel="section">给我留言</a>
            </li>
            <li class="menu-item menu-item-search">
              <a role="button" class="popup-trigger">搜索 </a>
            </li>
          </ul>
        </nav>
        <div class="search-pop-overlay">
          <div class="popup search-popup">
            <div class="search-header">
              <span class="search-icon">
                <i class="fa fa-search"></i>
              </span>
              <div class="search-input-container">
                <input autocomplete="off" autocapitalize="off" placeholder="搜索..." spellcheck="false" type="search" class="search-input">
              </div>
              <span class="popup-btn-close">
                <i class="fa fa-times-circle"></i>
              </span>
            </div>
            <div id="search-result">
              <div id="no-result">
                <i class="fa fa-spinner fa-pulse fa-5x fa-fw"></i>
              </div>
            </div>
          </div>
        </div>
      </div>
    </header>
    <div class="back-to-top">
      <i class="fa fa-arrow-up"></i>
      <span>0%</span>
    </div>
    <div class="reading-progress-bar"></div>
    <main class="main">
      <div class="main-inner">
        <div class="content-wrap">
          <div class="content post posts-expand">
            <article itemscope itemtype="http://schema.org/Article" class="post-block single" lang="zh-CN">
              <link itemprop="mainEntityOfPage" href="https://cuiqingcai.com/5534.html">
              <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
                <meta itemprop="image" content="/images/avatar.png">
                <meta itemprop="name" content="崔庆才">
                <meta itemprop="description" content="崔庆才的个人站点，记录生活的瞬间，分享学习的心得。">
              </span>
              <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
                <meta itemprop="name" content="静觅">
              </span>
              <header class="post-header">
                <h1 class="post-title" itemprop="name headline"> [Python3网络爬虫开发实战] 3.4-抓取猫眼电影排行 </h1>
                <div class="post-meta">
                  <span class="post-meta-item">
                    <span class="post-meta-item-icon">
                      <i class="far fa-user"></i>
                    </span>
                    <span class="post-meta-item-text">作者</span>
                    <span><a href="/authors/崔庆才" class="author" itemprop="url" rel="index">崔庆才</a></span>
                  </span>
                  <span class="post-meta-item">
                    <span class="post-meta-item-icon">
                      <i class="far fa-calendar"></i>
                    </span>
                    <span class="post-meta-item-text">发表于</span>
                    <time title="创建时间：2018-01-27 14:40:15" itemprop="dateCreated datePublished" datetime="2018-01-27T14:40:15+08:00">2018-01-27</time>
                  </span>
                  <span class="post-meta-item">
                    <span class="post-meta-item-icon">
                      <i class="far fa-folder"></i>
                    </span>
                    <span class="post-meta-item-text">分类于</span>
                    <span itemprop="about" itemscope itemtype="http://schema.org/Thing">
                      <a href="/categories/Python/" itemprop="url" rel="index"><span itemprop="name">Python</span></a>
                    </span>
                  </span>
                  <span id="/5534.html" class="post-meta-item leancloud_visitors" data-flag-title="[Python3网络爬虫开发实战] 3.4-抓取猫眼电影排行" title="阅读次数">
                    <span class="post-meta-item-icon">
                      <i class="fa fa-eye"></i>
                    </span>
                    <span class="post-meta-item-text">阅读次数：</span>
                    <span class="leancloud-visitors-count"></span>
                  </span>
                  <span class="post-meta-item" title="本文字数">
                    <span class="post-meta-item-icon">
                      <i class="far fa-file-word"></i>
                    </span>
                    <span class="post-meta-item-text">本文字数：</span>
                    <span>10k</span>
                  </span>
                  <span class="post-meta-item" title="阅读时长">
                    <span class="post-meta-item-icon">
                      <i class="far fa-clock"></i>
                    </span>
                    <span class="post-meta-item-text">阅读时长 &asymp;</span>
                    <span>9 分钟</span>
                  </span>
                </div>
              </header>
              <div class="post-body" itemprop="articleBody">
                <div class="advertisements">
                  <div class="item">
                    <a href="http://i0k.cn/4UUsd" target="_blank">
                      <img src="https://qiniu.cuiqingcai.com/dsdhf.jpg">
                    </a>
                  </div>
                </div>
                <p>本节中，我们利用requests库和正则表达式来抓取猫眼电影TOP100的相关内容。requests比urllib使用更加方便，而且目前我们还没有系统学习HTML解析库，所以这里就选用正则表达式来作为解析工具。</p>
                <h2 id="1-本节目标"><a href="#1-本节目标" class="headerlink" title="1. 本节目标"></a>1. 本节目标</h2>
                <p>本节中，我们要提取出猫眼电影TOP100的电影名称、时间、评分、图片等信息，提取的站点URL为<a href="http://maoyan.com/board/4" target="_blank" rel="noopener">http://maoyan.com/board/4</a>，提取的结果会以文件形式保存下来。</p>
                <h2 id="2-准备工作"><a href="#2-准备工作" class="headerlink" title="2. 准备工作"></a>2. 准备工作</h2>
                <p>在本节开始之前，请确保已经正确安装好了requests库。如果没有安装，可以参考第1章的安装说明。</p>
                <h2 id="3-抓取分析"><a href="#3-抓取分析" class="headerlink" title="3. 抓取分析"></a>3. 抓取分析</h2>
                <p>我们需要抓取的目标站点为<a href="http://maoyan.com/board/4" target="_blank" rel="noopener">http://maoyan.com/board/4</a>，打开之后便可以查看到榜单信息，如图3-11所示。 <img src="https://qiniu.cuiqingcai.com/wp-content/uploads/2018/02/3-11.jpg" alt="">图3-11 榜单信息</p>
                <p>排名第一的电影是霸王别姬，页面中显示的有效信息有影片名称、主演、上映时间、上映地区、评分、图片等信息。</p>
                <p>将网页滚动到最下方，可以发现有分页的列表，直接点击第2页，观察页面的URL和内容发生了怎样的变化，如图3-12所示。</p>
                <p><img src="https://qiniu.cuiqingcai.com/wp-content/uploads/2018/02/3-12.jpg" alt="">图3-12 页面URL变化</p>
                <p>可以发现页面的URL变成<a href="http://maoyan.com/board/4?offset=10" target="_blank" rel="noopener">http://maoyan.com/board/4?offset=10</a>，比之前的URL多了一个参数，那就是<code>offset=10</code>，而目前显示的结果是排行11~20名的电影，初步推断这是一个偏移量的参数。再点击下一页，发现页面的URL变成了<a href="http://maoyan.com/board/4?offset=20" target="_blank" rel="noopener">http://maoyan.com/board/4?offset=20</a>，参数<code>offset</code>变成了20，而显示的结果是排行21~30的电影。</p>
                <p>由此可以总结出规律，<code>offset</code>代表偏移量值，如果偏移量为<code>n</code>，则显示的电影序号就是<code>n+1</code>到<code>n+10</code>，每页显示10个。所以，如果想获取TOP100电影，只需要分开请求10次，而10次的<code>offset</code>参数分别设置为0、10、20、…90即可，这样获取不同的页面之后，再用正则表达式提取出相关信息，就可以得到TOP100的所有电影信息了。</p>
                <h2 id="4-抓取首页"><a href="#4-抓取首页" class="headerlink" title="4. 抓取首页"></a>4. 抓取首页</h2>
                <p>接下来用代码实现这个过程。首先抓取第一页的内容。我们实现了<code>get_one_page()</code>方法，并给它传入<code>url</code>参数。然后将抓取的页面结果返回，再通过<code>main()</code>方法调用。初步代码实现如下：</p>
                <figure class="highlight python">
                  <table>
                    <tr>
                      <td class="gutter">
                        <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br></pre>
                      </td>
                      <td class="code">
                        <pre><span class="line"><span class="keyword">import</span> requests</span><br><span class="line"></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">get_one_page</span><span class="params">(url)</span>:</span></span><br><span class="line">    response = requests.get(url)</span><br><span class="line">    <span class="keyword">if</span> response.status_code == <span class="number">200</span>:</span><br><span class="line">        <span class="keyword">return</span> response.text</span><br><span class="line">    <span class="keyword">return</span> <span class="literal">None</span></span><br><span class="line"></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">main</span><span class="params">()</span>:</span></span><br><span class="line">    url = <span class="string">'http://maoyan.com/board/4'</span></span><br><span class="line">    html = get_one_page(url)</span><br><span class="line">    print(html)</span><br><span class="line"></span><br><span class="line">main()</span><br></pre>
                      </td>
                    </tr>
                  </table>
                </figure>
                <p>这样运行之后，就可以成功获取首页的源代码了。获取源代码后，就需要解析页面，提取出我们想要的信息。</p>
                <h2 id="5-正则提取"><a href="#5-正则提取" class="headerlink" title="5. 正则提取"></a>5. 正则提取</h2>
                <p>接下来，回到网页看一下页面的真实源码。在开发者模式下的Network监听组件中查看源代码，如图3-13所示。</p>
                <p><img src="https://qiniu.cuiqingcai.com/wp-content/uploads/2018/02/3-13.jpg" alt="">图3-13 源代码</p>
                <p>注意，这里不要在Elements选项卡中直接查看源码，因为那里的源码可能经过JavaScript操作而与原始请求不同，而是需要从Network选项卡部分查看原始请求得到的源码。</p>
                <p>查看其中一个条目的源代码，如图3-14所示。</p>
                <p><img src="https://qiniu.cuiqingcai.com/wp-content/uploads/2018/02/3-14.jpg" alt="">图3-14 源代码</p>
                <p>可以看到，一部电影信息对应的源代码是一个<code>dd</code>节点，我们用正则表达式来提取这里面的一些电影信息。首先，需要提取它的排名信息。而它的排名信息是在<code>class</code>为<code>board-index</code>的<code>i</code>节点内，这里利用非贪婪匹配来提取<code>i</code>节点内的信息，正则表达式写为：</p>
                <figure class="highlight xml">
                  <table>
                    <tr>
                      <td class="gutter">
                        <pre><span class="line">1</span><br></pre>
                      </td>
                      <td class="code">
                        <pre><span class="line"><span class="tag">&lt;<span class="name">dd</span>&gt;</span>.*?board-index.*?&gt;(.*?)<span class="tag">&lt;/<span class="name">i</span>&gt;</span></span><br></pre>
                      </td>
                    </tr>
                  </table>
                </figure>
                <p>随后需要提取电影的图片。可以看到，后面有<code>a</code>节点，其内部有两个<code>img</code>节点。经过检查后发现，第二个<code>img</code>节点的<code>data-src</code>属性是图片的链接。这里提取第二个<code>img</code>节点的<code>data-src</code>属性，正则表达式可以改写如下：</p>
                <figure class="highlight dts">
                  <table>
                    <tr>
                      <td class="gutter">
                        <pre><span class="line">1</span><br></pre>
                      </td>
                      <td class="code">
                        <pre><span class="line"><span class="params">&lt;dd&gt;</span>.*?board-index.*?&gt;(.*?)<span class="params">&lt;/i&gt;</span>.*?data-src=<span class="string">"(.*?)"</span></span><br></pre>
                      </td>
                    </tr>
                  </table>
                </figure>
                <p>再往后，需要提取电影的名称，它在后面的<code>p</code>节点内，<code>class</code>为<code>name</code>。所以，可以用<code>name</code>做一个标志位，然后进一步提取到其内<code>a</code>节点的正文内容，此时正则表达式改写如下：</p>
                <figure class="highlight livecodeserver">
                  <table>
                    <tr>
                      <td class="gutter">
                        <pre><span class="line">1</span><br></pre>
                      </td>
                      <td class="code">
                        <pre><span class="line">&lt;dd&gt;.*?board-index.*<span class="meta">?&gt;</span>(.*?)&lt;/i&gt;.*?data-src=<span class="string">"(.*?)"</span>.*?name.*?<span class="keyword">a</span>.*<span class="meta">?&gt;</span>(.*?)&lt;/<span class="keyword">a</span>&gt;</span><br></pre>
                      </td>
                    </tr>
                  </table>
                </figure>
                <p>再提取主演、发布时间、评分等内容时，都是同样的原理。最后，正则表达式写为：</p>
                <figure class="highlight asciidoc">
                  <table>
                    <tr>
                      <td class="gutter">
                        <pre><span class="line">1</span><br></pre>
                      </td>
                      <td class="code">
                        <pre><span class="line">&lt;dd&gt;.<span class="strong">*?board-index.*</span>?&gt;(.<span class="strong">*?)&lt;/i&gt;.*</span>?data-src="(.<span class="strong">*?)".*</span>?name.<span class="strong">*?a.*</span>?&gt;(.<span class="strong">*?)&lt;/a&gt;.*</span>?star.<span class="strong">*?&gt;(.*</span>?)&lt;/p&gt;.<span class="strong">*?releasetime.*</span>?&gt;(.<span class="strong">*?)&lt;/p&gt;.*</span>?integer.<span class="strong">*?&gt;(.*</span>?)&lt;/i&gt;.<span class="strong">*?fraction.*</span>?&gt;(.<span class="strong">*?)&lt;/i&gt;.*</span>?&lt;/dd&gt;</span><br></pre>
                      </td>
                    </tr>
                  </table>
                </figure>
                <p>这样一个正则表达式可以匹配一个电影的结果，里面匹配了7个信息。接下来，通过调用<code>findall()</code>方法提取出所有的内容。</p>
                <p>接下来，我们再定义解析页面的方法<code>parse_one_page()</code>，主要是通过正则表达式来从结果中提取出我们想要的内容，实现代码如下：</p>
                <figure class="highlight vbnet">
                  <table>
                    <tr>
                      <td class="gutter">
                        <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br></pre>
                      </td>
                      <td class="code">
                        <pre><span class="line">def parse_one_page(html):</span><br><span class="line">    pattern = re.compile(</span><br><span class="line">        <span class="comment">'<span class="doctag">&lt;dd&gt;</span>.*?board-index.*?&gt;(.*?)<span class="doctag">&lt;/i&gt;</span>.*?data-src="(.*?)".*?name.*?a.*?&gt;(.*?)<span class="doctag">&lt;/a&gt;</span>.*?star.*?&gt;(.*?)<span class="doctag">&lt;/p&gt;</span>.*?releasetime.*?&gt;(.*?)<span class="doctag">&lt;/p&gt;</span>.*?integer.*?&gt;(.*?)<span class="doctag">&lt;/i&gt;</span>.*?fraction.*?&gt;(.*?)<span class="doctag">&lt;/i&gt;</span>.*?<span class="doctag">&lt;/dd&gt;</span>',</span></span><br><span class="line">        re.S)</span><br><span class="line">    items = re.findall(pattern, html)</span><br><span class="line">    print(items)</span><br></pre>
                      </td>
                    </tr>
                  </table>
                </figure>
                <p>这样就可以成功地将一页的10个电影信息都提取出来，这是一个列表形式，输出结果如下：</p>
                <figure class="highlight scheme">
                  <table>
                    <tr>
                      <td class="gutter">
                        <pre><span class="line">1</span><br></pre>
                      </td>
                      <td class="code">
                        <pre><span class="line">[(<span class="symbol">'1</span>', <span class="symbol">'http://p1.meituan.net/movie/20803f59291c47e1e116c11963ce019e68711.jpg@160w_220h_1e_1c</span>', <span class="symbol">'霸王别姬</span>', '\n                主演：张国荣,张丰毅,巩俐\n        ', <span class="symbol">'上映时间：1993-01-01</span>(<span class="name">中国香港</span>)', <span class="symbol">'9.</span>', <span class="symbol">'6</span>'), (<span class="symbol">'2</span>', <span class="symbol">'http://p0.meituan.net/movie/__40191813__4767047.jpg@160w_220h_1e_1c</span>', <span class="symbol">'肖申克的救赎</span>', '\n                主演：蒂姆·罗宾斯,摩根·弗里曼,鲍勃·冈顿\n        ', <span class="symbol">'上映时间：1994-10-14</span>(<span class="name">美国</span>)', <span class="symbol">'9.</span>', <span class="symbol">'5</span>'), (<span class="symbol">'3</span>', <span class="symbol">'http://p0.meituan.net/movie/fc9d78dd2ce84d20e53b6d1ae2eea4fb1515304.jpg@160w_220h_1e_1c</span>', <span class="symbol">'这个杀手不太冷</span>', '\n                主演：让·雷诺,加里·奥德曼,娜塔莉·波特曼\n        ', <span class="symbol">'上映时间：1994-09-14</span>(<span class="name">法国</span>)', <span class="symbol">'9.</span>', <span class="symbol">'5</span>'), (<span class="symbol">'4</span>', <span class="symbol">'http://p0.meituan.net/movie/23/6009725.jpg@160w_220h_1e_1c</span>', <span class="symbol">'罗马假日</span>', '\n                主演：格利高利·派克,奥黛丽·赫本,埃迪·艾伯特\n        ', <span class="symbol">'上映时间：1953-09-02</span>(<span class="name">美国</span>)', <span class="symbol">'9.</span>', <span class="symbol">'1</span>'), (<span class="symbol">'5</span>', <span class="symbol">'http://p0.meituan.net/movie/53/1541925.jpg@160w_220h_1e_1c</span>', <span class="symbol">'阿甘正传</span>', '\n                主演：汤姆·汉克斯,罗宾·怀特,加里·西尼斯\n        ', <span class="symbol">'上映时间：1994-07-06</span>(<span class="name">美国</span>)', <span class="symbol">'9.</span>', <span class="symbol">'4</span>'), (<span class="symbol">'6</span>', <span class="symbol">'http://p0.meituan.net/movie/11/324629.jpg@160w_220h_1e_1c</span>', <span class="symbol">'泰坦尼克号</span>', '\n                主演：莱昂纳多·迪卡普里奥,凯特·温丝莱特,比利·赞恩\n        ', <span class="symbol">'上映时间：1998-04-03</span>', <span class="symbol">'9.</span>', <span class="symbol">'5</span>'), (<span class="symbol">'7</span>', <span class="symbol">'http://p0.meituan.net/movie/99/678407.jpg@160w_220h_1e_1c</span>', <span class="symbol">'龙猫</span>', '\n                主演：日高法子,坂本千夏,糸井重里\n        ', <span class="symbol">'上映时间：1988-04-16</span>(<span class="name">日本</span>)', <span class="symbol">'9.</span>', <span class="symbol">'2</span>'), (<span class="symbol">'8</span>', <span class="symbol">'http://p0.meituan.net/movie/92/8212889.jpg@160w_220h_1e_1c</span>', <span class="symbol">'教父</span>', '\n                主演：马龙·白兰度,阿尔·帕西诺,詹姆斯·凯恩\n        ', <span class="symbol">'上映时间：1972-03-24</span>(<span class="name">美国</span>)', <span class="symbol">'9.</span>', <span class="symbol">'3</span>'), (<span class="symbol">'9</span>', <span class="symbol">'http://p0.meituan.net/movie/62/109878.jpg@160w_220h_1e_1c</span>', <span class="symbol">'唐伯虎点秋香</span>', '\n                主演：周星驰,巩俐,郑佩佩\n        ', <span class="symbol">'上映时间：1993-07-01</span>(<span class="name">中国香港</span>)', <span class="symbol">'9.</span>', <span class="symbol">'2</span>'), (<span class="symbol">'10</span>', <span class="symbol">'http://p0.meituan.net/movie/9bf7d7b81001a9cf8adbac5a7cf7d766132425.jpg@160w_220h_1e_1c</span>', <span class="symbol">'千与千寻</span>', '\n                主演：柊瑠美,入野自由,夏木真理\n        ', <span class="symbol">'上映时间：2001-07-20</span>(<span class="name">日本</span>)', <span class="symbol">'9.</span>', <span class="symbol">'3</span>')]</span><br></pre>
                      </td>
                    </tr>
                  </table>
                </figure>
                <p>但这样还不够，数据比较杂乱，我们再将匹配结果处理一下，遍历提取结果并生成字典，此时方法改写如下：</p>
                <figure class="highlight xquery">
                  <table>
                    <tr>
                      <td class="gutter">
                        <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br></pre>
                      </td>
                      <td class="code">
                        <pre><span class="line">def parse_one_page(html):</span><br><span class="line">    pattern = re.compile(</span><br><span class="line">        <span class="string">'&lt;dd&gt;.*?board-index.*?&gt;(.*?)&lt;/i&gt;.*?data-src="(.*?)".*?name.*?a.*?&gt;(.*?)&lt;/a&gt;.*?star.*?&gt;(.*?)&lt;/p&gt;.*?releasetime.*?&gt;(.*?)&lt;/p&gt;.*?integer.*?&gt;(.*?)&lt;/i&gt;.*?fraction.*?&gt;(.*?)&lt;/i&gt;.*?&lt;/dd&gt;'</span>,</span><br><span class="line">        re.S)</span><br><span class="line">    items = re.findall(pattern, html)</span><br><span class="line">    <span class="keyword">for</span> <span class="type">item</span> <span class="keyword">in</span> items:</span><br><span class="line">        yield &#123;</span><br><span class="line">            <span class="string">'index'</span>: <span class="type">item</span>[<span class="number">0</span>],</span><br><span class="line">            <span class="string">'image'</span>: <span class="type">item</span>[<span class="number">1</span>],</span><br><span class="line">            <span class="string">'title'</span>: <span class="type">item</span>[<span class="number">2</span>].<span class="keyword">strip</span>(),</span><br><span class="line">            <span class="string">'actor'</span>: <span class="type">item</span>[<span class="number">3</span>].<span class="keyword">strip</span>()[<span class="number">3</span>:] <span class="keyword">if</span> len(<span class="type">item</span>[<span class="number">3</span>]) &gt; <span class="number">3</span> <span class="keyword">else</span> <span class="string">''</span>,</span><br><span class="line">            <span class="string">'time'</span>: <span class="type">item</span>[<span class="number">4</span>].<span class="keyword">strip</span>()[<span class="number">5</span>:] <span class="keyword">if</span> len(<span class="type">item</span>[<span class="number">4</span>]) &gt; <span class="number">5</span> <span class="keyword">else</span> <span class="string">''</span>,</span><br><span class="line">            <span class="string">'score'</span>: <span class="type">item</span>[<span class="number">5</span>].<span class="keyword">strip</span>() + <span class="type">item</span>[<span class="number">6</span>].<span class="keyword">strip</span>()</span><br><span class="line">        &#125;</span><br></pre>
                      </td>
                    </tr>
                  </table>
                </figure>
                <p>这样就可以成功提取出电影的排名、图片、标题、演员、时间、评分等内容了，并把它赋值为一个个的字典，形成结构化数据。运行结果如下：</p>
                <figure class="highlight 1c">
                  <table>
                    <tr>
                      <td class="gutter">
                        <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br></pre>
                      </td>
                      <td class="code">
                        <pre><span class="line">&#123;'image': 'http://p1.meituan.net/movie/<span class="number">2080</span>3f<span class="number">5929</span>1c47e1e116c<span class="number">1196</span>3ce019e<span class="number">6871</span>1.jpg@160w_220h_1e_1c', 'actor': '张国荣,张丰毅,巩俐', 'score': '9.6', 'index': '1', 'title': '霸王别姬', 'time': '<span class="number">1993-01-01</span>(中国香港)'&#125;</span><br><span class="line">&#123;'image': 'http://p0.meituan.net/movie/__<span class="number">40191813</span>__<span class="number">476704</span>7.jpg@160w_220h_1e_1c', 'actor': '蒂姆·罗宾斯,摩根·弗里曼,鲍勃·冈顿', 'score': '9.5', 'index': '2', 'title': '肖申克的救赎', 'time': '<span class="number">1994-10-14</span>(美国)'&#125;</span><br><span class="line">&#123;'image': 'http://p0.meituan.net/movie/fc9d78dd2ce84d20e53b6d1ae2eea4fb<span class="number">151530</span>4.jpg@160w_220h_1e_1c', 'actor': '让·雷诺,加里·奥德曼,娜塔莉·波特曼', 'score': '9.5', 'index': '3', 'title': '这个杀手不太冷', 'time': '<span class="number">1994-09-14</span>(法国)'&#125;</span><br><span class="line">&#123;'image': 'http://p0.meituan.net/movie/23/<span class="number">600972</span>5.jpg@160w_220h_1e_1c', 'actor': '格利高利·派克,奥黛丽·赫本,埃迪·艾伯特', 'score': '9.1', 'index': '4', 'title': '罗马假日', 'time': '<span class="number">1953-09-02</span>(美国)'&#125;</span><br><span class="line">&#123;'image': 'http://p0.meituan.net/movie/53/<span class="number">154192</span>5.jpg@160w_220h_1e_1c', 'actor': '汤姆·汉克斯,罗宾·怀特,加里·西尼斯', 'score': '9.4', 'index': '5', 'title': '阿甘正传', 'time': '<span class="number">1994-07-06</span>(美国)'&#125;</span><br><span class="line">&#123;'image': 'http://p0.meituan.net/movie/11/<span class="number">324629</span>.jpg@160w_220h_1e_1c', 'actor': '莱昂纳多·迪卡普里奥,凯特·温丝莱特,比利·赞恩', 'score': '9.5', 'index': '6', 'title': '泰坦尼克号', 'time': '<span class="number">1998-04-03</span>'&#125;</span><br><span class="line">&#123;'image': 'http://p0.meituan.net/movie/99/<span class="number">678407</span>.jpg@160w_220h_1e_1c', 'actor': '日高法子,坂本千夏,糸井重里', 'score': '9.2', 'index': '7', 'title': '龙猫', 'time': '<span class="number">1988-04-16</span>(日本)'&#125;</span><br><span class="line">&#123;'image': 'http://p0.meituan.net/movie/92/<span class="number">821288</span>9.jpg@160w_220h_1e_1c', 'actor': '马龙·白兰度,阿尔·帕西诺,詹姆斯·凯恩', 'score': '9.3', 'index': '8', 'title': '教父', 'time': '<span class="number">1972-03-24</span>(美国)'&#125;</span><br><span class="line">&#123;'image': 'http://p0.meituan.net/movie/62/<span class="number">109878</span>.jpg@160w_220h_1e_1c', 'actor': '周星驰,巩俐,郑佩佩', 'score': '9.2', 'index': '9', 'title': '唐伯虎点秋香', 'time': '<span class="number">1993-07-01</span>(中国香港)'&#125;</span><br><span class="line">&#123;'image': 'http://p0.meituan.net/movie/9bf7d7b<span class="number">8100</span>1a9cf8adbac5a7cf7d<span class="number">76613242</span>5.jpg@160w_220h_1e_1c', 'actor': '柊瑠美,入野自由,夏木真理', 'score': '9.3', 'index': '10', 'title': '千与千寻', 'time': '<span class="number">2001-07-20</span>(日本)'&#125;</span><br></pre>
                      </td>
                    </tr>
                  </table>
                </figure>
                <p>到此为止，我们就成功提取了单页的电影信息。</p>
                <h2 id="6-写入文件"><a href="#6-写入文件" class="headerlink" title="6. 写入文件"></a>6. 写入文件</h2>
                <p>随后，我们将提取的结果写入文件，这里直接写入到一个文本文件中。这里通过JSON库的<code>dumps()</code>方法实现字典的序列化，并指定<code>ensure_ascii</code>参数为<code>False</code>，这样可以保证输出结果是中文形式而不是Unicode编码。代码如下：</p>
                <figure class="highlight python">
                  <table>
                    <tr>
                      <td class="gutter">
                        <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br></pre>
                      </td>
                      <td class="code">
                        <pre><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">write_to_json</span><span class="params">(content)</span>:</span></span><br><span class="line">    <span class="keyword">with</span> open(<span class="string">'result.txt'</span>, <span class="string">'a'</span>) <span class="keyword">as</span> f:</span><br><span class="line">        print(type(json.dumps(content)))</span><br><span class="line">        f.write(json.dumps(content, ensure_ascii=<span class="literal">False</span>,).encode(<span class="string">'utf-8'</span>))</span><br></pre>
                      </td>
                    </tr>
                  </table>
                </figure>
                <p>通过调用<code>write_to_json()</code>方法即可实现将字典写入到文本文件的过程，此处的<code>content</code>参数就是一部电影的提取结果，是一个字典。</p>
                <h2 id="7-整合代码"><a href="#7-整合代码" class="headerlink" title="7. 整合代码"></a>7. 整合代码</h2>
                <p>最后，实现<code>main()</code>方法来调用前面实现的方法，将单页的电影结果写入到文件。相关代码如下：</p>
                <figure class="highlight isbl">
                  <table>
                    <tr>
                      <td class="gutter">
                        <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br></pre>
                      </td>
                      <td class="code">
                        <pre><span class="line"><span class="variable">def</span> <span class="function"><span class="title">main</span>():</span></span><br><span class="line"><span class="function">    <span class="variable">url</span> = <span class="string">'http://maoyan.com/board/4'</span></span></span><br><span class="line"><span class="function">    <span class="variable">html</span> = <span class="title">get_one_page</span>(<span class="variable">url</span>)</span></span><br><span class="line">    <span class="variable">for</span> <span class="variable">item</span> <span class="variable"><span class="keyword">in</span></span> <span class="function"><span class="title">parse_one_page</span>(<span class="variable">html</span>):</span></span><br><span class="line"><span class="function">        <span class="title">write_to_json</span>(<span class="variable">item</span>)</span></span><br></pre>
                      </td>
                    </tr>
                  </table>
                </figure>
                <p>到此为止，我们就完成了单页电影的提取，也就是首页的10部电影可以成功提取并保存到文本文件中了。</p>
                <h2 id="8-分页爬取"><a href="#8-分页爬取" class="headerlink" title="8. 分页爬取"></a>8. 分页爬取</h2>
                <p>因为我们需要抓取的是TOP100的电影，所以还需要遍历一下，给这个链接传入<code>offset</code>参数，实现其他90部电影的爬取，此时添加如下调用即可：</p>
                <figure class="highlight isbl">
                  <table>
                    <tr>
                      <td class="gutter">
                        <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br></pre>
                      </td>
                      <td class="code">
                        <pre><span class="line"><span class="variable"><span class="keyword">if</span></span> <span class="variable">__name__</span> == <span class="string">'__main__'</span>:</span><br><span class="line">    <span class="variable">for</span> <span class="variable">i</span> <span class="variable"><span class="keyword">in</span></span> <span class="function"><span class="title">range</span>(<span class="number">10</span>):</span></span><br><span class="line"><span class="function">        <span class="title">main</span>(<span class="variable">offset</span>=<span class="variable">i</span> * <span class="number">10</span>)</span></span><br></pre>
                      </td>
                    </tr>
                  </table>
                </figure>
                <p>这里还需要将<code>main()</code>方法修改一下，接收一个<code>offset</code>值作为偏移量，然后构造URL进行爬取。实现代码如下：</p>
                <figure class="highlight isbl">
                  <table>
                    <tr>
                      <td class="gutter">
                        <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br></pre>
                      </td>
                      <td class="code">
                        <pre><span class="line"><span class="variable">def</span> <span class="function"><span class="title">main</span>(<span class="variable">offset</span>):</span></span><br><span class="line"><span class="function">    <span class="variable">url</span> = <span class="string">'http://maoyan.com/board/4?offset='</span> + <span class="title">str</span>(<span class="variable">offset</span>)</span></span><br><span class="line">    <span class="variable">html</span> = <span class="function"><span class="title">get_one_page</span>(<span class="variable">url</span>)</span></span><br><span class="line">    <span class="variable">for</span> <span class="variable">item</span> <span class="variable"><span class="keyword">in</span></span> <span class="function"><span class="title">parse_one_page</span>(<span class="variable">html</span>):</span></span><br><span class="line"><span class="function">        <span class="title">print</span>(<span class="variable">item</span>)</span></span><br><span class="line">        <span class="function"><span class="title">write_to_file</span>(<span class="variable">item</span>)</span></span><br></pre>
                      </td>
                    </tr>
                  </table>
                </figure>
                <p>到此为止，我们的猫眼电影TOP100的爬虫就全部完成了，再稍微整理一下，完整的代码如下：</p>
                <figure class="highlight python">
                  <table>
                    <tr>
                      <td class="gutter">
                        <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br><span class="line">43</span><br><span class="line">44</span><br><span class="line">45</span><br></pre>
                      </td>
                      <td class="code">
                        <pre><span class="line"><span class="keyword">import</span> json</span><br><span class="line"><span class="keyword">import</span> requests</span><br><span class="line"><span class="keyword">from</span> requests.exceptions <span class="keyword">import</span> RequestException</span><br><span class="line"><span class="keyword">import</span> re</span><br><span class="line"><span class="keyword">import</span> time</span><br><span class="line"></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">get_one_page</span><span class="params">(url)</span>:</span></span><br><span class="line">    <span class="keyword">try</span>:</span><br><span class="line">        response = requests.get(url)</span><br><span class="line">        <span class="keyword">if</span> response.status_code == <span class="number">200</span>:</span><br><span class="line">            <span class="keyword">return</span> response.text</span><br><span class="line">        <span class="keyword">return</span> <span class="literal">None</span></span><br><span class="line">    <span class="keyword">except</span> RequestException:</span><br><span class="line">        <span class="keyword">return</span> <span class="literal">None</span></span><br><span class="line"></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">parse_one_page</span><span class="params">(html)</span>:</span></span><br><span class="line">    pattern = re.compile(<span class="string">'&lt;dd&gt;.*?board-index.*?&gt;(\d+)&lt;/i&gt;.*?data-src="(.*?)".*?name"&gt;&lt;a'</span></span><br><span class="line">                         + <span class="string">'.*?&gt;(.*?)&lt;/a&gt;.*?star"&gt;(.*?)&lt;/p&gt;.*?releasetime"&gt;(.*?)&lt;/p&gt;'</span></span><br><span class="line">                         + <span class="string">'.*?integer"&gt;(.*?)&lt;/i&gt;.*?fraction"&gt;(.*?)&lt;/i&gt;.*?&lt;/dd&gt;'</span>, re.S)</span><br><span class="line">    items = re.findall(pattern, html)</span><br><span class="line">    <span class="keyword">for</span> item <span class="keyword">in</span> items:</span><br><span class="line">        <span class="keyword">yield</span> &#123;</span><br><span class="line">            <span class="string">'index'</span>: item[<span class="number">0</span>],</span><br><span class="line">            <span class="string">'image'</span>: item[<span class="number">1</span>],</span><br><span class="line">            <span class="string">'title'</span>: item[<span class="number">2</span>],</span><br><span class="line">            <span class="string">'actor'</span>: item[<span class="number">3</span>].strip()[<span class="number">3</span>:],</span><br><span class="line">            <span class="string">'time'</span>: item[<span class="number">4</span>].strip()[<span class="number">5</span>:],</span><br><span class="line">            <span class="string">'score'</span>: item[<span class="number">5</span>] + item[<span class="number">6</span>]</span><br><span class="line">        &#125;</span><br><span class="line"></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">write_to_file</span><span class="params">(content)</span>:</span></span><br><span class="line">    <span class="keyword">with</span> open(<span class="string">'result.txt'</span>, <span class="string">'a'</span>, encoding=<span class="string">'utf-8'</span>) <span class="keyword">as</span> f:</span><br><span class="line">        f.write(json.dumps(content, ensure_ascii=<span class="literal">False</span>) + <span class="string">'\n'</span>)</span><br><span class="line"></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">main</span><span class="params">(offset)</span>:</span></span><br><span class="line">    url = <span class="string">'http://maoyan.com/board/4?offset='</span> + str(offset)</span><br><span class="line">    html = get_one_page(url)</span><br><span class="line">    <span class="keyword">for</span> item <span class="keyword">in</span> parse_one_page(html):</span><br><span class="line">        print(item)</span><br><span class="line">        write_to_file(item)</span><br><span class="line"></span><br><span class="line"><span class="keyword">if</span> __name__ == <span class="string">'__main__'</span>:</span><br><span class="line">    <span class="keyword">for</span> i <span class="keyword">in</span> range(<span class="number">10</span>):</span><br><span class="line">        main(offset=i * <span class="number">10</span>)</span><br><span class="line">        time.sleep(<span class="number">1</span>)</span><br></pre>
                      </td>
                    </tr>
                  </table>
                </figure>
                <p>现在猫眼多了反爬虫，如果速度过快，则会无响应，所以这里又增加了一个延时等待。</p>
                <h2 id="9-运行结果"><a href="#9-运行结果" class="headerlink" title="9. 运行结果"></a>9. 运行结果</h2>
                <p>最后，我们运行一下代码，输出结果类似如下：</p>
                <figure class="highlight 1c">
                  <table>
                    <tr>
                      <td class="gutter">
                        <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br></pre>
                      </td>
                      <td class="code">
                        <pre><span class="line">&#123;'index': '1', 'image': 'http://p1.meituan.net/movie/<span class="number">2080</span>3f<span class="number">5929</span>1c47e1e116c<span class="number">1196</span>3ce019e<span class="number">6871</span>1.jpg@160w_220h_1e_1c', 'title': '霸王别姬', 'actor': '张国荣,张丰毅,巩俐', 'time': '<span class="number">1993-01-01</span>(中国香港)', 'score': '9.6'&#125;</span><br><span class="line">&#123;'index': '2', 'image': 'http://p0.meituan.net/movie/__<span class="number">40191813</span>__<span class="number">476704</span>7.jpg@160w_220h_1e_1c', 'title': '肖申克的救赎', 'actor': '蒂姆·罗宾斯,摩根·弗里曼,鲍勃·冈顿', 'time': '<span class="number">1994-10-14</span>(美国)', 'score': '9.5'&#125;</span><br><span class="line">...</span><br><span class="line">&#123;'index': '98', 'image': 'http://p0.meituan.net/movie/76/<span class="number">707338</span>9.jpg@160w_220h_1e_1c', 'title': '东京物语', 'actor': '笠智众,原节子,杉村春子', 'time': '<span class="number">1953-11-03</span>(日本)', 'score': '9.1'&#125;</span><br><span class="line">&#123;'index': '99', 'image': 'http://p0.meituan.net/movie/52/<span class="number">342029</span>3.jpg@160w_220h_1e_1c', 'title': '我爱你', 'actor': '宋在河,李彩恩,吉海延', 'time': '<span class="number">2011-02-17</span>(韩国)', 'score': '9.0'&#125;</span><br><span class="line">&#123;'index': '100', 'image': 'http://p1.meituan.net/movie/__<span class="number">44335138</span>__<span class="number">847077</span>9.jpg@160w_220h_1e_1c', 'title': '迁徙的鸟', 'actor': '雅克·贝汉,菲利普·拉波洛,Philippe Labro', 'time': '<span class="number">2001-12-12</span>(法国)', 'score': '9.1'&#125;</span><br></pre>
                      </td>
                    </tr>
                  </table>
                </figure>
                <p>这里省略了中间的部分输出结果。可以看到，这样就成功地把TOP100的电影信息爬取下来了。</p>
                <p>这时我们再看下文本文件，结果如图3-15所示。</p>
                <p><img src="https://qiniu.cuiqingcai.com/wp-content/uploads/2018/02/3-15.jpg" alt="">图3-15 运行结果</p>
                <p>可以看到，电影信息也已全部保存到了文本文件中了，大功告成！</p>
                <h2 id="10-本节代码"><a href="#10-本节代码" class="headerlink" title="10. 本节代码"></a>10. 本节代码</h2>
                <p>本节的代码地址为<a href="https://github.com/Python3WebSpider/MaoYan" target="_blank" rel="noopener">https://github.com/Python3WebSpider/MaoYan</a>。</p>
                <p>本节中，我们通过爬取猫眼TOP100的电影信息练习了requests和正则表达式的用法。这是一个最基础的实例，希望大家可以通过这个实例对爬虫的实现有一个最基本的思路，也对这两个库的用法有更深一步的了解。</p>
              </div>
              <div class="popular-posts-header">相关文章</div>
              <ul class="popular-posts">
                <li class="popular-posts-item">
                  <div class="popular-posts-title"><a href="/5081.html" rel="bookmark">[Python3网络爬虫开发实战] 1.2-请求库的安装</a></div>
                </li>
                <li class="popular-posts-item">
                  <div class="popular-posts-title"><a href="/5132.html" rel="bookmark">[Python3网络爬虫开发实战] 1.2.1-Requests的安装</a></div>
                </li>
                <li class="popular-posts-item">
                  <div class="popular-posts-title"><a href="/5141.html" rel="bookmark">[Python3网络爬虫开发实战] 1.2.2-Selenium的安装</a></div>
                </li>
                <li class="popular-posts-item">
                  <div class="popular-posts-title"><a href="/5135.html" rel="bookmark">[Python3网络爬虫开发实战] 1.2.3-ChromeDriver的安装</a></div>
                </li>
                <li class="popular-posts-item">
                  <div class="popular-posts-title"><a href="/5153.html" rel="bookmark">[Python3网络爬虫开发实战] 1.2.4-GeckoDriver的安装</a></div>
                </li>
              </ul>
              <div class="reward-container">
                <div></div>
                <button onclick="var qr = document.getElementById('qr'); qr.style.display = (qr.style.display === 'none') ? 'block' : 'none';"> 打赏 </button>
                <div id="qr" style="display: none;">
                  <div style="display: inline-block;">
                    <img src="/images/wechatpay.jpg" alt="崔庆才 微信支付">
                    <p>微信支付</p>
                  </div>
                  <div style="display: inline-block;">
                    <img src="/images/alipay.jpg" alt="崔庆才 支付宝">
                    <p>支付宝</p>
                  </div>
                </div>
              </div>
              <footer class="post-footer">
                <div class="post-tags">
                  <a href="/tags/python/" rel="tag"><i class="fa fa-tag"></i> python</a>
                </div>
                <div class="post-nav">
                  <div class="post-nav-item">
                    <a href="/5530.html" rel="prev" title="[Python3网络爬虫开发实战] 3.3-正则表达式">
                      <i class="fa fa-chevron-left"></i> [Python3网络爬虫开发实战] 3.3-正则表达式 </a>
                  </div>
                  <div class="post-nav-item">
                    <a href="/5542.html" rel="next" title="[Python3网络爬虫开发实战] 4-解析库的使用"> [Python3网络爬虫开发实战] 4-解析库的使用 <i class="fa fa-chevron-right"></i>
                    </a>
                  </div>
                </div>
              </footer>
            </article>
          </div>
          <div class="comments" id="gitalk-container"></div>
          <script>
            window.addEventListener('tabs:register', () =>
            {
              let
              {
                activeClass
              } = CONFIG.comments;
              if (CONFIG.comments.storage)
              {
                activeClass = localStorage.getItem('comments_active') || activeClass;
              }
              if (activeClass)
              {
                let activeTab = document.querySelector(`a[href="#comment-${activeClass}"]`);
                if (activeTab)
                {
                  activeTab.click();
                }
              }
            });
            if (CONFIG.comments.storage)
            {
              window.addEventListener('tabs:click', event =>
              {
                if (!event.target.matches('.tabs-comment .tab-content .tab-pane')) return;
                let commentClass = event.target.classList[1];
                localStorage.setItem('comments_active', commentClass);
              });
            }

          </script>
        </div>
        <div class="toggle sidebar-toggle">
          <span class="toggle-line toggle-line-first"></span>
          <span class="toggle-line toggle-line-middle"></span>
          <span class="toggle-line toggle-line-last"></span>
        </div>
        <aside class="sidebar">
          <div class="sidebar-inner">
            <ul class="sidebar-nav motion-element">
              <li class="sidebar-nav-toc"> 文章目录 </li>
              <li class="sidebar-nav-overview"> 站点概览 </li>
            </ul>
            <!--noindex-->
            <div class="post-toc-wrap sidebar-panel">
              <div class="post-toc motion-element">
                <ol class="nav">
                  <li class="nav-item nav-level-2"><a class="nav-link" href="#1-本节目标"><span class="nav-number">1.</span> <span class="nav-text">1. 本节目标</span></a></li>
                  <li class="nav-item nav-level-2"><a class="nav-link" href="#2-准备工作"><span class="nav-number">2.</span> <span class="nav-text">2. 准备工作</span></a></li>
                  <li class="nav-item nav-level-2"><a class="nav-link" href="#3-抓取分析"><span class="nav-number">3.</span> <span class="nav-text">3. 抓取分析</span></a></li>
                  <li class="nav-item nav-level-2"><a class="nav-link" href="#4-抓取首页"><span class="nav-number">4.</span> <span class="nav-text">4. 抓取首页</span></a></li>
                  <li class="nav-item nav-level-2"><a class="nav-link" href="#5-正则提取"><span class="nav-number">5.</span> <span class="nav-text">5. 正则提取</span></a></li>
                  <li class="nav-item nav-level-2"><a class="nav-link" href="#6-写入文件"><span class="nav-number">6.</span> <span class="nav-text">6. 写入文件</span></a></li>
                  <li class="nav-item nav-level-2"><a class="nav-link" href="#7-整合代码"><span class="nav-number">7.</span> <span class="nav-text">7. 整合代码</span></a></li>
                  <li class="nav-item nav-level-2"><a class="nav-link" href="#8-分页爬取"><span class="nav-number">8.</span> <span class="nav-text">8. 分页爬取</span></a></li>
                  <li class="nav-item nav-level-2"><a class="nav-link" href="#9-运行结果"><span class="nav-number">9.</span> <span class="nav-text">9. 运行结果</span></a></li>
                  <li class="nav-item nav-level-2"><a class="nav-link" href="#10-本节代码"><span class="nav-number">10.</span> <span class="nav-text">10. 本节代码</span></a></li>
                </ol>
              </div>
            </div>
            <!--/noindex-->
            <div class="site-overview-wrap sidebar-panel">
              <div class="site-author motion-element" itemprop="author" itemscope itemtype="http://schema.org/Person">
                <img class="site-author-image" itemprop="image" alt="崔庆才" src="/images/avatar.png">
                <p class="site-author-name" itemprop="name">崔庆才</p>
                <div class="site-description" itemprop="description">崔庆才的个人站点，记录生活的瞬间，分享学习的心得。</div>
              </div>
              <div class="site-state-wrap motion-element">
                <nav class="site-state">
                  <div class="site-state-item site-state-posts">
                    <a href="/archives/">
                      <span class="site-state-item-count">608</span>
                      <span class="site-state-item-name">日志</span>
                    </a>
                  </div>
                  <div class="site-state-item site-state-categories">
                    <a href="/categories/">
                      <span class="site-state-item-count">24</span>
                      <span class="site-state-item-name">分类</span></a>
                  </div>
                  <div class="site-state-item site-state-tags">
                    <a href="/tags/">
                      <span class="site-state-item-count">156</span>
                      <span class="site-state-item-name">标签</span></a>
                  </div>
                </nav>
              </div>
              <div class="links-of-author motion-element">
                <span class="links-of-author-item">
                  <a href="https://github.com/Germey" title="GitHub → https:&#x2F;&#x2F;github.com&#x2F;Germey" rel="noopener" target="_blank"><i class="fab fa-github fa-fw"></i>GitHub</a>
                </span>
                <span class="links-of-author-item">
                  <a href="mailto:cqc@cuiqingcai.com.com" title="邮件 → mailto:cqc@cuiqingcai.com.com" rel="noopener" target="_blank"><i class="fa fa-envelope fa-fw"></i>邮件</a>
                </span>
                <span class="links-of-author-item">
                  <a href="https://weibo.com/cuiqingcai" title="微博 → https:&#x2F;&#x2F;weibo.com&#x2F;cuiqingcai" rel="noopener" target="_blank"><i class="fab fa-weibo fa-fw"></i>微博</a>
                </span>
                <span class="links-of-author-item">
                  <a href="https://www.zhihu.com/people/Germey" title="知乎 → https:&#x2F;&#x2F;www.zhihu.com&#x2F;people&#x2F;Germey" rel="noopener" target="_blank"><i class="fa fa-magic fa-fw"></i>知乎</a>
                </span>
              </div>
            </div>
            <div style=" width: 100%;" class="sidebar-panel sidebar-panel-image sidebar-panel-active">
              <a href="https://tutorial.lengyue.video/?coupon=12ef4b1a-a3db-11ea-bb37-0242ac130002_cqx_850" target="_blank" rel="noopener">
                <img src="https://qiniu.cuiqingcai.com/bco2a.png" style=" width: 100%;">
              </a>
            </div>
            <div style=" width: 100%;" class="sidebar-panel sidebar-panel-image sidebar-panel-active">
              <a href="http://www.ipidea.net/?utm-source=cqc&utm-keyword=?cqc" target="_blank" rel="noopener">
                <img src="https://qiniu.cuiqingcai.com/0ywun.png" style=" width: 100%;">
              </a>
            </div>
            <div class="sidebar-panel sidebar-panel-tags sidebar-panel-active">
              <h4 class="name"> 标签云 </h4>
              <div class="content">
                <a href="/tags/2048/" style="font-size: 10px;">2048</a> <a href="/tags/API/" style="font-size: 10px;">API</a> <a href="/tags/Bootstrap/" style="font-size: 11.25px;">Bootstrap</a> <a href="/tags/CDN/" style="font-size: 10px;">CDN</a> <a href="/tags/CQC/" style="font-size: 10px;">CQC</a> <a href="/tags/CSS/" style="font-size: 10px;">CSS</a> <a href="/tags/CSS-%E5%8F%8D%E7%88%AC%E8%99%AB/" style="font-size: 10px;">CSS 反爬虫</a> <a href="/tags/CV/" style="font-size: 10px;">CV</a> <a href="/tags/Django/" style="font-size: 10px;">Django</a> <a href="/tags/Eclipse/" style="font-size: 11.25px;">Eclipse</a> <a href="/tags/FTP/" style="font-size: 10px;">FTP</a> <a href="/tags/Git/" style="font-size: 10px;">Git</a> <a href="/tags/GitHub/" style="font-size: 13.75px;">GitHub</a> <a href="/tags/HTML5/" style="font-size: 10px;">HTML5</a> <a href="/tags/Hexo/" style="font-size: 10px;">Hexo</a> <a href="/tags/IT/" style="font-size: 10px;">IT</a> <a href="/tags/JSP/" style="font-size: 10px;">JSP</a> <a href="/tags/JavaScript/" style="font-size: 10px;">JavaScript</a> <a href="/tags/K8s/" style="font-size: 10px;">K8s</a> <a href="/tags/LOGO/" style="font-size: 10px;">LOGO</a> <a href="/tags/Linux/" style="font-size: 10px;">Linux</a> <a href="/tags/MIUI/" style="font-size: 10px;">MIUI</a> <a href="/tags/MongoDB/" style="font-size: 10px;">MongoDB</a> <a href="/tags/Mysql/" style="font-size: 10px;">Mysql</a> <a href="/tags/NBA/" style="font-size: 10px;">NBA</a> <a href="/tags/PHP/" style="font-size: 11.25px;">PHP</a> <a href="/tags/PS/" style="font-size: 10px;">PS</a> <a href="/tags/Pathlib/" style="font-size: 10px;">Pathlib</a> <a href="/tags/PhantomJS/" style="font-size: 10px;">PhantomJS</a> <a href="/tags/Python/" style="font-size: 15px;">Python</a> <a href="/tags/Python3/" style="font-size: 12.5px;">Python3</a> <a href="/tags/Pythonic/" style="font-size: 10px;">Pythonic</a> <a href="/tags/QQ/" style="font-size: 10px;">QQ</a> <a href="/tags/Redis/" style="font-size: 10px;">Redis</a> <a href="/tags/SAE/" style="font-size: 10px;">SAE</a> <a href="/tags/SSH/" style="font-size: 10px;">SSH</a> <a href="/tags/SVG/" style="font-size: 10px;">SVG</a> <a href="/tags/Scrapy/" style="font-size: 10px;">Scrapy</a> <a href="/tags/Scrapy-redis/" style="font-size: 10px;">Scrapy-redis</a> <a href="/tags/Scrapy%E5%88%86%E5%B8%83%E5%BC%8F/" style="font-size: 10px;">Scrapy分布式</a> <a href="/tags/Selenium/" style="font-size: 10px;">Selenium</a> <a href="/tags/TKE/" style="font-size: 10px;">TKE</a> <a href="/tags/Ubuntu/" style="font-size: 11.25px;">Ubuntu</a> <a href="/tags/VS-Code/" style="font-size: 10px;">VS Code</a> <a href="/tags/Vs-Code/" style="font-size: 10px;">Vs Code</a> <a href="/tags/Vue/" style="font-size: 11.25px;">Vue</a> <a href="/tags/Webpack/" style="font-size: 10px;">Webpack</a> <a href="/tags/Windows/" style="font-size: 10px;">Windows</a> <a href="/tags/Winpcap/" style="font-size: 10px;">Winpcap</a> <a href="/tags/WordPress/" style="font-size: 13.75px;">WordPress</a> <a href="/tags/Youtube/" style="font-size: 11.25px;">Youtube</a> <a href="/tags/android/" style="font-size: 10px;">android</a> <a href="/tags/ansible/" style="font-size: 10px;">ansible</a> <a href="/tags/cocos2d-x/" style="font-size: 10px;">cocos2d-x</a> <a href="/tags/e6/" style="font-size: 10px;">e6</a> <a href="/tags/fitvids/" style="font-size: 10px;">fitvids</a> <a href="/tags/git/" style="font-size: 11.25px;">git</a> <a href="/tags/json/" style="font-size: 10px;">json</a> <a href="/tags/js%E9%80%86%E5%90%91/" style="font-size: 10px;">js逆向</a> <a href="/tags/kubernetes/" style="font-size: 10px;">kubernetes</a> <a href="/tags/log/" style="font-size: 10px;">log</a> <a href="/tags/logging/" style="font-size: 10px;">logging</a> <a href="/tags/matlab/" style="font-size: 11.25px;">matlab</a> <a href="/tags/python/" style="font-size: 20px;">python</a> <a href="/tags/pytube/" style="font-size: 11.25px;">pytube</a> <a href="/tags/pywin32/" style="font-size: 10px;">pywin32</a> <a href="/tags/style/" style="font-size: 10px;">style</a> <a href="/tags/tomcat/" style="font-size: 10px;">tomcat</a> <a href="/tags/ubuntu/" style="font-size: 10px;">ubuntu</a> <a href="/tags/uwsgi/" style="font-size: 10px;">uwsgi</a> <a href="/tags/vsftpd/" style="font-size: 10px;">vsftpd</a> <a href="/tags/wamp/" style="font-size: 10px;">wamp</a> <a href="/tags/wineQQ/" style="font-size: 10px;">wineQQ</a> <a href="/tags/%E4%B8%83%E7%89%9B/" style="font-size: 11.25px;">七牛</a> <a href="/tags/%E4%B8%8A%E6%B5%B7/" style="font-size: 10px;">上海</a> <a href="/tags/%E4%B8%AA%E4%BA%BA%E7%BD%91%E7%AB%99/" style="font-size: 10px;">个人网站</a> <a href="/tags/%E4%B8%BB%E9%A2%98/" style="font-size: 10px;">主题</a> <a href="/tags/%E4%BA%91%E4%BA%A7%E5%93%81/" style="font-size: 10px;">云产品</a> <a href="/tags/%E4%BA%91%E5%AD%98%E5%82%A8/" style="font-size: 10px;">云存储</a> <a href="/tags/%E4%BA%AC%E4%B8%9C%E4%BA%91/" style="font-size: 10px;">京东云</a> <a href="/tags/%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD/" style="font-size: 12.5px;">人工智能</a> <a href="/tags/%E4%BB%A3%E7%90%86/" style="font-size: 10px;">代理</a> <a href="/tags/%E4%BB%A3%E7%A0%81/" style="font-size: 10px;">代码</a> <a href="/tags/%E4%BB%A3%E7%A0%81%E5%88%86%E4%BA%AB%E5%9B%BE/" style="font-size: 10px;">代码分享图</a> <a href="/tags/%E4%BC%98%E5%8C%96/" style="font-size: 10px;">优化</a> <a href="/tags/%E4%BD%8D%E8%BF%90%E7%AE%97/" style="font-size: 10px;">位运算</a> <a href="/tags/%E5%85%AC%E4%BC%97%E5%8F%B7/" style="font-size: 10px;">公众号</a> <a href="/tags/%E5%88%86%E4%BA%AB/" style="font-size: 10px;">分享</a> <a href="/tags/%E5%88%86%E5%B8%83%E5%BC%8F/" style="font-size: 10px;">分布式</a> <a href="/tags/%E5%88%9B%E4%B8%9A/" style="font-size: 10px;">创业</a> <a href="/tags/%E5%89%8D%E7%AB%AF/" style="font-size: 12.5px;">前端</a> <a href="/tags/%E5%8D%9A%E5%AE%A2/" style="font-size: 10px;">博客</a> <a href="/tags/%E5%8E%9F%E7%94%9FAPP/" style="font-size: 10px;">原生APP</a> <a href="/tags/%E5%8F%8D%E7%88%AC%E8%99%AB/" style="font-size: 12.5px;">反爬虫</a> <a href="/tags/%E5%91%BD%E4%BB%A4/" style="font-size: 10px;">命令</a> <a href="/tags/%E5%93%8D%E5%BA%94%E5%BC%8F%E5%B8%83%E5%B1%80/" style="font-size: 10px;">响应式布局</a> <a href="/tags/%E5%9E%83%E5%9C%BE%E9%82%AE%E4%BB%B6/" style="font-size: 10px;">垃圾邮件</a> <a href="/tags/%E5%9F%9F%E5%90%8D%E7%BB%91%E5%AE%9A/" style="font-size: 10px;">域名绑定</a> <a href="/tags/%E5%A4%8D%E7%9B%98/" style="font-size: 10px;">复盘</a> <a href="/tags/%E5%A4%A7%E4%BC%97%E7%82%B9%E8%AF%84/" style="font-size: 10px;">大众点评</a> <a href="/tags/%E5%AD%97%E4%BD%93%E5%8F%8D%E7%88%AC%E8%99%AB/" style="font-size: 10px;">字体反爬虫</a> <a href="/tags/%E5%AD%97%E7%AC%A6%E9%97%AE%E9%A2%98/" style="font-size: 10px;">字符问题</a> <a href="/tags/%E5%AD%A6%E4%B9%A0%E6%96%B9%E6%B3%95/" style="font-size: 10px;">学习方法</a> <a href="/tags/%E5%AE%89%E5%8D%93/" style="font-size: 10px;">安卓</a> <a href="/tags/%E5%AE%9E%E7%94%A8/" style="font-size: 10px;">实用</a> <a href="/tags/%E5%B0%81%E9%9D%A2/" style="font-size: 10px;">封面</a> <a href="/tags/%E5%B4%94%E5%BA%86%E6%89%8D/" style="font-size: 18.75px;">崔庆才</a> <a href="/tags/%E5%B7%A5%E5%85%B7/" style="font-size: 12.5px;">工具</a> <a href="/tags/%E5%BC%80%E5%8F%91%E5%B7%A5%E5%85%B7/" style="font-size: 10px;">开发工具</a> <a href="/tags/%E5%BE%AE%E8%BD%AF/" style="font-size: 10px;">微软</a> <a href="/tags/%E6%80%9D%E8%80%83/" style="font-size: 10px;">思考</a> <a href="/tags/%E6%89%8B%E6%9C%BA%E8%AE%BF%E9%97%AE/" style="font-size: 10px;">手机访问</a> <a href="/tags/%E6%95%99%E7%A8%8B/" style="font-size: 10px;">教程</a> <a href="/tags/%E6%95%99%E8%82%B2/" style="font-size: 10px;">教育</a> <a href="/tags/%E6%96%B0%E4%B9%A6/" style="font-size: 12.5px;">新书</a> <a href="/tags/%E6%96%B9%E6%B3%95%E8%AE%BA/" style="font-size: 10px;">方法论</a> <a href="/tags/%E6%97%85%E6%B8%B8/" style="font-size: 10px;">旅游</a> <a href="/tags/%E6%97%A5%E5%BF%97/" style="font-size: 10px;">日志</a> <a href="/tags/%E6%9A%97%E6%97%B6%E9%97%B4/" style="font-size: 10px;">暗时间</a> <a href="/tags/%E6%9D%9C%E5%85%B0%E7%89%B9/" style="font-size: 11.25px;">杜兰特</a> <a href="/tags/%E6%A1%8C%E9%9D%A2/" style="font-size: 10px;">桌面</a> <a href="/tags/%E6%AD%8C%E5%8D%95/" style="font-size: 10px;">歌单</a> <a href="/tags/%E6%B1%9F%E5%8D%97/" style="font-size: 10px;">江南</a> <a href="/tags/%E6%B8%B8%E6%88%8F/" style="font-size: 10px;">游戏</a> <a href="/tags/%E7%84%A6%E8%99%91/" style="font-size: 10px;">焦虑</a> <a href="/tags/%E7%88%AC%E8%99%AB/" style="font-size: 16.25px;">爬虫</a> <a href="/tags/%E7%88%AC%E8%99%AB%E4%B9%A6%E7%B1%8D/" style="font-size: 11.25px;">爬虫书籍</a> <a href="/tags/%E7%8E%AF%E5%A2%83%E5%8F%98%E9%87%8F/" style="font-size: 10px;">环境变量</a> <a href="/tags/%E7%94%9F%E6%B4%BB%E7%AC%94%E8%AE%B0/" style="font-size: 10px;">生活笔记</a> <a href="/tags/%E7%99%BB%E5%BD%95/" style="font-size: 10px;">登录</a> <a href="/tags/%E7%9F%A5%E4%B9%8E/" style="font-size: 10px;">知乎</a> <a href="/tags/%E7%9F%AD%E4%BF%A1/" style="font-size: 10px;">短信</a> <a href="/tags/%E7%9F%AD%E4%BF%A1%E9%AA%8C%E8%AF%81%E7%A0%81/" style="font-size: 10px;">短信验证码</a> <a href="/tags/%E7%AC%94%E8%AE%B0%E8%BD%AF%E4%BB%B6/" style="font-size: 10px;">笔记软件</a> <a href="/tags/%E7%AF%AE%E7%BD%91/" style="font-size: 10px;">篮网</a> <a href="/tags/%E7%BA%B8%E5%BC%A0/" style="font-size: 10px;">纸张</a> <a href="/tags/%E7%BB%84%E4%BB%B6/" style="font-size: 10px;">组件</a> <a href="/tags/%E7%BD%91%E7%AB%99/" style="font-size: 10px;">网站</a> <a href="/tags/%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB/" style="font-size: 11.25px;">网络爬虫</a> <a href="/tags/%E7%BE%8E%E5%AD%A6/" style="font-size: 10px;">美学</a> <a href="/tags/%E8%82%89%E5%A4%B9%E9%A6%8D/" style="font-size: 10px;">肉夹馍</a> <a href="/tags/%E8%85%BE%E8%AE%AF%E4%BA%91/" style="font-size: 10px;">腾讯云</a> <a href="/tags/%E8%87%AA%E5%BE%8B/" style="font-size: 10px;">自律</a> <a href="/tags/%E8%A5%BF%E5%B0%91%E7%88%B7/" style="font-size: 10px;">西少爷</a> <a href="/tags/%E8%A7%86%E9%A2%91/" style="font-size: 10px;">视频</a> <a href="/tags/%E8%B0%B7%E6%AD%8C%E9%AA%8C%E8%AF%81%E7%A0%81/" style="font-size: 10px;">谷歌验证码</a> <a href="/tags/%E8%BF%90%E8%90%A5/" style="font-size: 10px;">运营</a> <a href="/tags/%E8%BF%9C%E7%A8%8B/" style="font-size: 10px;">远程</a> <a href="/tags/%E9%80%86%E5%90%91/" style="font-size: 10px;">逆向</a> <a href="/tags/%E9%85%8D%E7%BD%AE/" style="font-size: 10px;">配置</a> <a href="/tags/%E9%87%8D%E8%A3%85/" style="font-size: 10px;">重装</a> <a href="/tags/%E9%98%BF%E6%9D%9C/" style="font-size: 10px;">阿杜</a> <a href="/tags/%E9%9D%99%E8%A7%85/" style="font-size: 17.5px;">静觅</a> <a href="/tags/%E9%A2%A0%E8%A6%86/" style="font-size: 10px;">颠覆</a> <a href="/tags/%E9%A3%9E%E4%BF%A1/" style="font-size: 10px;">飞信</a> <a href="/tags/%E9%B8%BF%E8%92%99/" style="font-size: 10px;">鸿蒙</a>
              </div>
              <script>
                const tagsColors = ['#00a67c', '#5cb85c', '#d9534f', '#567e95', '#b37333', '#f4843d', '#15a287']
                const tagsElements = document.querySelectorAll('.sidebar-panel-tags .content a')
                tagsElements.forEach((item) =>
                {
                  item.style.backgroundColor = tagsColors[Math.floor(Math.random() * tagsColors.length)]
                })

              </script>
            </div>
            <div class="sidebar-panel sidebar-panel-categories sidebar-panel-active">
              <h4 class="name"> 分类 </h4>
              <div class="content">
                <ul class="category-list">
                  <li class="category-list-item"><a class="category-list-link" href="/categories/C-C/">C/C++</a><span class="category-list-count">23</span></li>
                  <li class="category-list-item"><a class="category-list-link" href="/categories/HTML/">HTML</a><span class="category-list-count">14</span></li>
                  <li class="category-list-item"><a class="category-list-link" href="/categories/Java/">Java</a><span class="category-list-count">5</span></li>
                  <li class="category-list-item"><a class="category-list-link" href="/categories/JavaScript/">JavaScript</a><span class="category-list-count">26</span></li>
                  <li class="category-list-item"><a class="category-list-link" href="/categories/Linux/">Linux</a><span class="category-list-count">15</span></li>
                  <li class="category-list-item"><a class="category-list-link" href="/categories/Markdown/">Markdown</a><span class="category-list-count">1</span></li>
                  <li class="category-list-item"><a class="category-list-link" href="/categories/Net/">Net</a><span class="category-list-count">4</span></li>
                  <li class="category-list-item"><a class="category-list-link" href="/categories/Other/">Other</a><span class="category-list-count">39</span></li>
                  <li class="category-list-item"><a class="category-list-link" href="/categories/PHP/">PHP</a><span class="category-list-count">27</span></li>
                  <li class="category-list-item"><a class="category-list-link" href="/categories/Paper/">Paper</a><span class="category-list-count">2</span></li>
                  <li class="category-list-item"><a class="category-list-link" href="/categories/Python/">Python</a><span class="category-list-count">261</span></li>
                  <li class="category-list-item"><a class="category-list-link" href="/categories/TypeScript/">TypeScript</a><span class="category-list-count">2</span></li>
                  <li class="category-list-item"><a class="category-list-link" href="/categories/%E4%B8%AA%E4%BA%BA%E5%B1%95%E7%A4%BA/">个人展示</a><span class="category-list-count">1</span></li>
                  <li class="category-list-item"><a class="category-list-link" href="/categories/%E4%B8%AA%E4%BA%BA%E6%97%A5%E8%AE%B0/">个人日记</a><span class="category-list-count">9</span></li>
                  <li class="category-list-item"><a class="category-list-link" href="/categories/%E4%B8%AA%E4%BA%BA%E8%AE%B0%E5%BD%95/">个人记录</a><span class="category-list-count">4</span></li>
                  <li class="category-list-item"><a class="category-list-link" href="/categories/%E4%B8%AA%E4%BA%BA%E9%9A%8F%E7%AC%94/">个人随笔</a><span class="category-list-count">15</span></li>
                  <li class="category-list-item"><a class="category-list-link" href="/categories/%E5%AE%89%E8%A3%85%E9%85%8D%E7%BD%AE/">安装配置</a><span class="category-list-count">59</span></li>
                  <li class="category-list-item"><a class="category-list-link" href="/categories/%E6%8A%80%E6%9C%AF%E6%9D%82%E8%B0%88/">技术杂谈</a><span class="category-list-count">88</span></li>
                  <li class="category-list-item"><a class="category-list-link" href="/categories/%E6%9C%AA%E5%88%86%E7%B1%BB/">未分类</a><span class="category-list-count">1</span></li>
                  <li class="category-list-item"><a class="category-list-link" href="/categories/%E7%94%9F%E6%B4%BB%E7%AC%94%E8%AE%B0/">生活笔记</a><span class="category-list-count">1</span></li>
                  <li class="category-list-item"><a class="category-list-link" href="/categories/%E7%A6%8F%E5%88%A9%E4%B8%93%E5%8C%BA/">福利专区</a><span class="category-list-count">6</span></li>
                  <li class="category-list-item"><a class="category-list-link" href="/categories/%E8%81%8C%E4%BD%8D%E6%8E%A8%E8%8D%90/">职位推荐</a><span class="category-list-count">2</span></li>
                </ul>
              </div>
            </div>
            <div class="sidebar-panel sidebar-panel-friends sidebar-panel-active">
              <h4 class="name"> 友情链接 </h4>
              <ul class="friends">
                <li class="friend">
                  <span class="logo">
                    <img src="https://qiniu.cuiqingcai.com/j2dub.jpg">
                  </span>
                  <span class="link">
                    <a href="https://www.findhao.net/" target="_blank" rel="noopener">FindHao</a>
                  </span>
                </li>
                <li class="friend">
                  <span class="logo">
                    <img src="https://qiniu.cuiqingcai.com/ou6mm.jpg">
                  </span>
                  <span class="link">
                    <a href="https://diygod.me/" target="_blank" rel="noopener">DIYgod</a>
                  </span>
                </li>
                <li class="friend">
                  <span class="logo">
                    <img src="https://qiniu.cuiqingcai.com/6apxu.jpg">
                  </span>
                  <span class="link">
                    <a href="https://www.51dev.com/" target="_blank" rel="noopener">IT技术社区</a>
                  </span>
                </li>
                <li class="friend">
                  <span class="logo">
                    <img src="https://www.jankl.com/img/titleshu.jpg">
                  </span>
                  <span class="link">
                    <a href="https://www.jankl.com/" target="_blank" rel="noopener">liberalist</a>
                  </span>
                </li>
                <li class="friend">
                  <span class="logo">
                    <img src="https://qiniu.cuiqingcai.com/bqlbs.png">
                  </span>
                  <span class="link">
                    <a href="http://www.urselect.com/" target="_blank" rel="noopener">优社电商</a>
                  </span>
                </li>
                <li class="friend">
                  <span class="logo">
                    <img src="https://qiniu.cuiqingcai.com/8s88c.jpg">
                  </span>
                  <span class="link">
                    <a href="https://www.yuanrenxue.com/" target="_blank" rel="noopener">猿人学</a>
                  </span>
                </li>
                <li class="friend">
                  <span class="logo">
                    <img src="https://qiniu.cuiqingcai.com/2wgg5.jpg">
                  </span>
                  <span class="link">
                    <a href="https://www.yunlifang.cn/" target="_blank" rel="noopener">云立方</a>
                  </span>
                </li>
                <li class="friend">
                  <span class="logo">
                    <img src="https://qiniu.cuiqingcai.com/shwr6.png">
                  </span>
                  <span class="link">
                    <a href="http://lanbing510.info/" target="_blank" rel="noopener">冰蓝</a>
                  </span>
                </li>
                <li class="friend">
                  <span class="logo">
                    <img src="https://qiniu.cuiqingcai.com/blvoh.jpg">
                  </span>
                  <span class="link">
                    <a href="https://lengyue.me/" target="_blank" rel="noopener">冷月</a>
                  </span>
                </li>
                <li class="friend">
                  <span class="logo">
                    <img src="http://qianxunclub.com/favicon.png">
                  </span>
                  <span class="link">
                    <a href="http://qianxunclub.com/" target="_blank" rel="noopener">千寻啊千寻</a>
                  </span>
                </li>
                <li class="friend">
                  <span class="logo">
                    <img src="https://qiniu.cuiqingcai.com/0044u.jpg">
                  </span>
                  <span class="link">
                    <a href="http://kodcloud.com/" target="_blank" rel="noopener">可道云</a>
                  </span>
                </li>
                <li class="friend">
                  <span class="logo">
                    <img src="https://qiniu.cuiqingcai.com/ygnpn.jpg">
                  </span>
                  <span class="link">
                    <a href="http://www.kunkundashen.cn/" target="_blank" rel="noopener">坤坤大神</a>
                  </span>
                </li>
                <li class="friend">
                  <span class="logo">
                    <img src="https://qiniu.cuiqingcai.com/22uv1.png">
                  </span>
                  <span class="link">
                    <a href="http://www.cenchong.com/" target="_blank" rel="noopener">岑冲博客</a>
                  </span>
                </li>
                <li class="friend">
                  <span class="logo">
                    <img src="https://qiniu.cuiqingcai.com/ev9kl.png">
                  </span>
                  <span class="link">
                    <a href="http://www.zxiaoji.com/" target="_blank" rel="noopener">张小鸡</a>
                  </span>
                </li>
                <li class="friend">
                  <span class="logo">
                    <img src="https://www.503error.com/favicon.ico">
                  </span>
                  <span class="link">
                    <a href="https://www.503error.com/" target="_blank" rel="noopener">张志明个人博客</a>
                  </span>
                </li>
                <li class="friend">
                  <span class="logo">
                    <img src="https://qiniu.cuiqingcai.com/x714o.jpg">
                  </span>
                  <span class="link">
                    <a href="http://www.hubwiz.com/" target="_blank" rel="noopener">汇智网</a>
                  </span>
                </li>
                <li class="friend">
                  <span class="logo">
                    <img src="https://qiniu.cuiqingcai.com/129d8.png">
                  </span>
                  <span class="link">
                    <a href="https://www.bysocket.com/" target="_blank" rel="noopener">泥瓦匠BYSocket</a>
                  </span>
                </li>
                <li class="friend">
                  <span class="logo">
                    <img src="https://www.xiongge.club/favicon.ico">
                  </span>
                  <span class="link">
                    <a href="https://www.xiongge.club/" target="_blank" rel="noopener">熊哥club</a>
                  </span>
                </li>
                <li class="friend">
                  <span class="logo">
                    <img src="https://qiniu.cuiqingcai.com/3w4fe.png">
                  </span>
                  <span class="link">
                    <a href="https://zerlong.com/" target="_blank" rel="noopener">知语</a>
                  </span>
                </li>
                <li class="friend">
                  <span class="logo">
                    <img src="https://qiniu.cuiqingcai.com/44hxf.png">
                  </span>
                  <span class="link">
                    <a href="http://redstonewill.com/" target="_blank" rel="noopener">红色石头</a>
                  </span>
                </li>
                <li class="friend">
                  <span class="logo">
                    <img src="https://qiniu.cuiqingcai.com/8g1fk.jpg">
                  </span>
                  <span class="link">
                    <a href="http://www.laodong.me/" target="_blank" rel="noopener">老董博客</a>
                  </span>
                </li>
                <li class="friend">
                  <span class="logo">
                    <img src="https://qiniu.cuiqingcai.com/wkaus.jpg">
                  </span>
                  <span class="link">
                    <a href="https://zhaoshuai.me/" target="_blank" rel="noopener">碎念</a>
                  </span>
                </li>
                <li class="friend">
                  <span class="logo">
                    <img src="https://qiniu.cuiqingcai.com/pgo0r.jpg">
                  </span>
                  <span class="link">
                    <a href="https://www.chenwenguan.com/" target="_blank" rel="noopener">陈文管的博客</a>
                  </span>
                </li>
                <li class="friend">
                  <span class="logo">
                    <img src="https://qiniu.cuiqingcai.com/kk82a.jpg">
                  </span>
                  <span class="link">
                    <a href="https://www.lxlinux.net/" target="_blank" rel="noopener">良许Linux教程网</a>
                  </span>
                </li>
                <li class="friend">
                  <span class="logo">
                    <img src="https://qiniu.cuiqingcai.com/lj0t2.jpg">
                  </span>
                  <span class="link">
                    <a href="https://tanqingbo.cn/" target="_blank" rel="noopener">IT码农</a>
                  </span>
                </li>
                <li class="friend">
                  <span class="logo">
                    <img src="https://qiniu.cuiqingcai.com/i8cdr.png">
                  </span>
                  <span class="link">
                    <a href="https://junyiseo.com/" target="_blank" rel="noopener">均益个人博客</a>
                  </span>
                </li>
                <li class="friend">
                  <span class="logo">
                    <img src="https://qiniu.cuiqingcai.com/chwv2.png">
                  </span>
                  <span class="link">
                    <a href="https://brucedone.com/" target="_blank" rel="noopener">大鱼的鱼塘</a>
                  </span>
                </li>
                <li class="friend">
                  <span class="logo">
                    <img src="https://qiniu.cuiqingcai.com/2y43o.png">
                  </span>
                  <span class="link">
                    <a href="http://bbs.nightteam.cn/" target="_blank" rel="noopener">夜幕爬虫安全论坛</a>
                  </span>
                </li>
                <li class="friend">
                  <span class="logo">
                    <img src="https://qiniu.cuiqingcai.com/zvc3w.jpg">
                  </span>
                  <span class="link">
                    <a href="https://www.weishidong.com/" target="_blank" rel="noopener">韦世东的技术专栏</a>
                  </span>
                </li>
                <li class="friend">
                  <span class="logo">
                    <img src="https://qiniu.cuiqingcai.com/ebudy.jpg">
                  </span>
                  <span class="link">
                    <a href="https://chuanjiabing.com/" target="_blank" rel="noopener">穿甲兵技术社区</a>
                  </span>
                </li>
              </ul>
            </div>
          </div>
        </aside>
        <div id="sidebar-dimmer"></div>
      </div>
    </main>
    <footer class="footer">
      <div class="footer-inner">
        <div class="copyright"> &copy; <span itemprop="copyrightYear">2021</span>
          <span class="with-love">
            <i class="fa fa-heart"></i>
          </span>
          <span class="author" itemprop="copyrightHolder">崔庆才丨静觅</span>
          <span class="post-meta-divider">|</span>
          <span class="post-meta-item-icon">
            <i class="fa fa-chart-area"></i>
          </span>
          <span title="站点总字数">2.6m</span>
          <span class="post-meta-divider">|</span>
          <span class="post-meta-item-icon">
            <i class="fa fa-coffee"></i>
          </span>
          <span title="站点阅读时长">39:54</span>
        </div>
        <div class="powered-by">由 <a href="https://hexo.io/" class="theme-link" rel="noopener" target="_blank">Hexo</a> & <a href="https://pisces.theme-next.org/" class="theme-link" rel="noopener" target="_blank">NexT.Pisces</a> 强力驱动 </div>
        <div class="beian"><a href="https://beian.miit.gov.cn/" rel="noopener" target="_blank">京ICP备18015597号-1 </a>
        </div>
        <script>
          (function ()
          {
            function leancloudSelector(url)
            {
              url = encodeURI(url);
              return document.getElementById(url).querySelector('.leancloud-visitors-count');
            }

            function addCount(Counter)
            {
              var visitors = document.querySelector('.leancloud_visitors');
              var url = decodeURI(visitors.id);
              var title = visitors.dataset.flagTitle;
              Counter('get', '/classes/Counter?where=' + encodeURIComponent(JSON.stringify(
              {
                url
              }))).then(response => response.json()).then((
              {
                results
              }) =>
              {
                if (results.length > 0)
                {
                  var counter = results[0];
                  leancloudSelector(url).innerText = counter.time + 1;
                  Counter('put', '/classes/Counter/' + counter.objectId,
                  {
                    time:
                    {
                      '__op': 'Increment',
                      'amount': 1
                    }
                  }).catch(error =>
                  {
                    console.error('Failed to save visitor count', error);
                  });
                }
                else
                {
                  Counter('post', '/classes/Counter',
                  {
                    title,
                    url,
                    time: 1
                  }).then(response => response.json()).then(() =>
                  {
                    leancloudSelector(url).innerText = 1;
                  }).catch(error =>
                  {
                    console.error('Failed to create', error);
                  });
                }
              }).catch(error =>
              {
                console.error('LeanCloud Counter Error', error);
              });
            }

            function showTime(Counter)
            {
              var visitors = document.querySelectorAll('.leancloud_visitors');
              var entries = [...visitors].map(element =>
              {
                return decodeURI(element.id);
              });
              Counter('get', '/classes/Counter?where=' + encodeURIComponent(JSON.stringify(
              {
                url:
                {
                  '$in': entries
                }
              }))).then(response => response.json()).then((
              {
                results
              }) =>
              {
                for (let url of entries)
                {
                  let target = results.find(item => item.url === url);
                  leancloudSelector(url).innerText = target ? target.time : 0;
                }
              }).catch(error =>
              {
                console.error('LeanCloud Counter Error', error);
              });
            }
            let
            {
              app_id,
              app_key,
              server_url
            } = {
              "enable": true,
              "app_id": "6X5dRQ0pnPWJgYy8SXOg0uID-gzGzoHsz",
              "app_key": "ziLDVEy73ne5HtFTiGstzHMS",
              "server_url": "https://6x5drq0p.lc-cn-n1-shared.com",
              "security": false
            };

            function fetchData(api_server)
            {
              var Counter = (method, url, data) =>
              {
                return fetch(`${api_server}/1.1${url}`,
                {
                  method,
                  headers:
                  {
                    'X-LC-Id': app_id,
                    'X-LC-Key': app_key,
                    'Content-Type': 'application/json',
                  },
                  body: JSON.stringify(data)
                });
              };
              if (CONFIG.page.isPost)
              {
                if (CONFIG.hostname !== location.hostname) return;
                addCount(Counter);
              }
              else if (document.querySelectorAll('.post-title-link').length >= 1)
              {
                showTime(Counter);
              }
            }
            let api_server = app_id.slice(-9) !== '-MdYXbMMI' ? server_url : `https://${app_id.slice(0, 8).toLowerCase()}.api.lncldglobal.com`;
            if (api_server)
            {
              fetchData(api_server);
            }
            else
            {
              fetch('https://app-router.leancloud.cn/2/route?appId=' + app_id).then(response => response.json()).then((
              {
                api_server
              }) =>
              {
                fetchData('https://' + api_server);
              });
            }
          })();

        </script>
      </div>
      <div class="footer-stat">
        <span id="cnzz_stat_icon_1279355174"></span>
        <script type="text/javascript">
          document.write(unescape("%3Cspan id='cnzz_stat_icon_1279355174'%3E%3C/span%3E%3Cscript src='https://v1.cnzz.com/z_stat.php%3Fid%3D1279355174%26online%3D1%26show%3Dline' type='text/javascript'%3E%3C/script%3E"));

        </script>
      </div>
    </footer>
  </div>
  <script src="//cdn.jsdelivr.net/npm/animejs@3.2.1/lib/anime.min.js"></script>
  <script src="//cdn.jsdelivr.net/npm/pangu@4/dist/browser/pangu.min.js"></script>
  <script src="/js/utils.js"></script>
  <script src="/.js"></script>
  <script src="/js/schemes/pisces.js"></script>
  <script src="/.js"></script>
  <script src="/js/next-boot.js"></script>
  <script src="/.js"></script>
  <script>
    (function ()
    {
      var canonicalURL, curProtocol;
      //Get the <link> tag
      var x = document.getElementsByTagName("link");
      //Find the last canonical URL
      if (x.length > 0)
      {
        for (i = 0; i < x.length; i++)
        {
          if (x[i].rel.toLowerCase() == 'canonical' && x[i].href)
          {
            canonicalURL = x[i].href;
          }
        }
      }
      //Get protocol
      if (!canonicalURL)
      {
        curProtocol = window.location.protocol.split(':')[0];
      }
      else
      {
        curProtocol = canonicalURL.split(':')[0];
      }
      //Get current URL if the canonical URL does not exist
      if (!canonicalURL) canonicalURL = window.location.href;
      //Assign script content. Replace current URL with the canonical URL
      ! function ()
      {
        var e = /([http|https]:\/\/[a-zA-Z0-9\_\.]+\.baidu\.com)/gi,
          r = canonicalURL,
          t = document.referrer;
        if (!e.test(r))
        {
          var n = (String(curProtocol).toLowerCase() === 'https') ? "https://sp0.baidu.com/9_Q4simg2RQJ8t7jm9iCKT-xh_/s.gif" : "//api.share.baidu.com/s.gif";
          t ? (n += "?r=" + encodeURIComponent(document.referrer), r && (n += "&l=" + r)) : r && (n += "?l=" + r);
          var i = new Image;
          i.src = n
        }
      }(window);
    })();

  </script>
  <script src="/js/local-search.js"></script>
  <script src="/.js"></script>
  <link rel="stylesheet" href="//cdn.jsdelivr.net/npm/gitalk@1/dist/gitalk.min.css">
  <script>
    NexT.utils.loadComments(document.querySelector('#gitalk-container'), () =>
    {
      NexT.utils.getScript('//cdn.jsdelivr.net/npm/gitalk@1/dist/gitalk.min.js', () =>
      {
        var gitalk = new Gitalk(
        {
          clientID: '4c86ce1d7c4fbb3b277c',
          clientSecret: '4927beb0f90e2c07e66c99d9d2529cf3eb8ac8e4',
          repo: 'Blog',
          owner: 'germey',
          admin: ['germey'],
          id: '43a1dfaddd48eb447b51f7681efe0401',
          language: 'zh-CN',
          distractionFreeMode: true
        });
        gitalk.render('gitalk-container');
      }, window.Gitalk);
    });

  </script>
</body>

</html>
